# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

#' Read a CSV or other delimited file with Arrow
#'
#' These functions use the Arrow C++ CSV reader to read into a `data.frame`.
#' Arrow C++ options have been mapped to argument names that follow those of
#' `readr::read_delim()`, and `col_select` was inspired by `vroom::vroom()`.
#'
#' `read_csv_arrow()` and `read_tsv_arrow()` are wrappers around
#' `read_delim_arrow()` that specify a delimiter.
#'
#' Note that not all `readr` options are currently implemented here. Please file
#' an issue if you encounter one that `arrow` should support.
#'
#' If you need to control Arrow-specific reader parameters that don't have an
#' equivalent in `readr::read_csv()`, you can either provide them in the
#' `parse_options`, `convert_options`, or `read_options` arguments, or you can
#' use [CsvTableReader] directly for lower-level access.
#'
#' @inheritParams make_readable_file
#' @param delim Single character used to separate fields within a record.
#' @param quote Single character used to quote strings.
#' @param escape_double Does the file escape quotes by doubling them?
#' i.e. If this option is `TRUE`, the value `""""` represents
#' a single quote, `\"`.
#' @param escape_backslash Does the file use backslashes to escape special
#' characters? This is more general than `escape_double` as backslashes
#' can be used to escape the delimiter character, the quote character, or
#' to add special characters like `\\n`.
#' @param col_names If `TRUE`, the first row of the input will be used as the
#' column names and will not be included in the data frame. If `FALSE`, column
#' names will be generated by Arrow, starting with "f0", "f1", ..., "fN".
#' Alternatively, you can specify a character vector of column names.
#' @param col_select A character vector of column names to keep, as in the
#' "select" argument to `data.table::fread()`, or a
#' [tidy selection specification][tidyselect::vars_select()]
#' of columns, as used in `dplyr::select()`.
#' @param na A character vector of strings to interpret as missing values.
#' @param quoted_na Should missing values inside quotes be treated as missing
#' values (the default) or strings. (Note that this is different from the
#' Arrow C++ default for the corresponding convert option,
#' `strings_can_be_null`.)
#' @param skip_empty_rows Should blank rows be ignored altogether? If
#' `TRUE`, blank rows will not be represented at all. If `FALSE`, they will be
#' filled with missings.
#' @param skip Number of lines to skip before reading data.
#' @param parse_options see [file reader options][CsvReadOptions].
#' If given, this overrides any
#' parsing options provided in other arguments (e.g. `delim`, `quote`, etc.).
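#' @param convert_options see [file reader options][CsvReadOptions].
#' If given, this overrides any conversion options provided in other
#' arguments (`na`, `quoted_na`).
#' @param read_options see [file reader options][CsvReadOptions].
#' If given, this overrides any read options provided in other
#' arguments (`skip`, `col_names`).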
#' @param as_data_frame Should the function return a `data.frame` (default) or
#' an Arrow [Table]?
#'
#' @return A `data.frame`, or a Table if `as_data_frame = FALSE`.
#' @export
#' @examples
#' \donttest{
#'   tf <- tempfile()
#'   on.exit(unlink(tf))
#'   write.csv(iris, file = tf)
#'   df <- read_csv_arrow(tf)
#'   dim(df)
#'   # Can select columns
#'   df <- read_csv_arrow(tf, col_select = starts_with("Sepal"))
#' }
read_delim_arrow <- function(file,
                             delim = ",",
                             quote = '"',
                             escape_double = TRUE,
                             escape_backslash = FALSE,
                             col_names = TRUE,
                             # col_types = TRUE,
                             col_select = NULL,
                             na = c("", "NA"),
                             quoted_na = TRUE,
                             skip_empty_rows = TRUE,
                             skip = 0L,
                             parse_options = NULL,
                             convert_options = NULL,
                             read_options = NULL,
                             as_data_frame = TRUE) {

  # Each Arrow options object is derived from the readr-style arguments only
  # when the caller did not supply one; an explicitly passed options object
  # is used as-is and the corresponding readr-style arguments are ignored.
  if (is.null(parse_options)) {
    parse_options <- readr_to_csv_parse_options(
      delim,
      quote,
      escape_double,
      escape_backslash,
      skip_empty_rows
    )
  }

  if (is.null(read_options)) {
    read_options <- readr_to_csv_read_options(skip, col_names)
  }
  if (is.null(convert_options)) {
    # TODO: col_types (needs wiring in CsvConvertOptions)
    convert_options <- readr_to_csv_convert_options(na, quoted_na)
  }

  if (is.string(file)) {
    # The handle is opened here, so it is closed here when the function exits.
    # `add = TRUE` keeps this cleanup from replacing (or being replaced by)
    # any other exit handler registered in this function.
    file <- make_readable_file(file)
    on.exit(file$close(), add = TRUE)
  }
  reader <- CsvTableReader$create(
    file,
    read_options = read_options,
    parse_options = parse_options,
    convert_options = convert_options
  )

  # `col_select` is captured as a quosure and unquoted into `$select()`, so
  # the caller's expression (e.g. `starts_with("Sepal")`) is passed along
  # unevaluated rather than being evaluated here.
  tab <- reader$Read()$select(!!enquo(col_select))

  if (isTRUE(as_data_frame)) {
    tab <- as.data.frame(tab)
  }

  tab
}

#' @rdname read_delim_arrow
#' @export
read_csv_arrow <- function(file,
                           quote = '"',
                           escape_double = TRUE,
                           escape_backslash = FALSE,
                           col_names = TRUE,
                           # col_types = TRUE,
                           col_select = NULL,
                           na = c("", "NA"),
                           quoted_na = TRUE,
                           skip_empty_rows = TRUE,
                           skip = 0L,
                           parse_options = NULL,
                           convert_options = NULL,
                           read_options = NULL,
                           as_data_frame = TRUE) {

  # Rather than forwarding evaluated values, take the call as the user wrote
  # it, point it at read_delim_arrow(), pin the delimiter to a comma, and
  # evaluate the rewritten call in the caller's frame.
  delim_call <- match.call()
  delim_call[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow"))
  delim_call$delim <- ","
  eval.parent(delim_call)
}

#' @rdname read_delim_arrow
#' @export
read_tsv_arrow <- function(file,
                           quote = '"',
                           escape_double = TRUE,
                           escape_backslash = FALSE,
                           col_names = TRUE,
                           # col_types = TRUE,
                           col_select = NULL,
                           na = c("", "NA"),
                           quoted_na = TRUE,
                           skip_empty_rows = TRUE,
                           skip = 0L,
                           parse_options = NULL,
                           convert_options = NULL,
                           read_options = NULL,
                           as_data_frame = TRUE) {

  # Rather than forwarding evaluated values, take the call as the user wrote
  # it, point it at read_delim_arrow(), pin the delimiter to a tab, and
  # evaluate the rewritten call in the caller's frame.
  delim_call <- match.call()
  delim_call[[1]] <- get("read_delim_arrow", envir = asNamespace("arrow"))
  delim_call$delim <- "\t"
  eval.parent(delim_call)
}

#' @title Arrow CSV and JSON table reader classes
#' @rdname CsvTableReader
#' @name CsvTableReader
#' @docType class
#' @usage NULL
#' @format NULL
#' @description `CsvTableReader` and `JsonTableReader` wrap the Arrow C++ CSV
#' and JSON table readers. See their usage in [read_csv_arrow()] and
#' [read_json_arrow()], respectively.
#'
#' @section Factory:
#'
#' The `CsvTableReader$create()` and `JsonTableReader$create()` factory methods
#' take the following arguments:
#'
#' - `file` A character path to a local file, or an Arrow input stream
#' - `convert_options` (CSV only), `parse_options`, `read_options`: see
#'    [CsvReadOptions]
#' - `...` additional parameters.
#'
#' @section Methods:
#'
#' - `$Read()`: returns an Arrow Table.
#'
#' @include arrow-package.R
#' @export
CsvTableReader <- R6Class("CsvTableReader", inherit = ArrowObject,
  public = list(
    # Run the underlying reader and hand the result to shared_ptr() as a Table
    Read = function() {
      table_ptr <- csv___TableReader__Read(self)
      shared_ptr(Table, table_ptr)
    }
  )
)
# Factory: resolve `file` through make_readable_file(), build the reader from
# that input plus the three option objects, and wrap it as a CsvTableReader.
# `...` is accepted but not used in this body.
CsvTableReader$create <- function(file,
                                  read_options = CsvReadOptions$create(),
                                  parse_options = CsvParseOptions$create(),
                                  convert_options = CsvConvertOptions$create(),
                                  ...) {
  input <- make_readable_file(file)
  reader_ptr <- csv___TableReader__Make(
    input,
    read_options,
    parse_options,
    convert_options
  )
  shared_ptr(CsvTableReader, reader_ptr)
}

#' @title File reader options
#' @rdname CsvReadOptions
#' @name CsvReadOptions
#' @docType class
#' @usage NULL
#' @format NULL
#' @description `CsvReadOptions`, `CsvParseOptions`, `CsvConvertOptions`,
#' `JsonReadOptions`, and `JsonParseOptions` are containers for various
#' file reading options. See their usage in [read_csv_arrow()] and
#' [read_json_arrow()], respectively.
#'
#' @section Factory:
#'
#' The `CsvReadOptions$create()` and `JsonReadOptions$create()` factory methods
#' take the following arguments:
#'
#' - `use_threads` Whether to use the global CPU thread pool
#' - `block_size` Block size we request from the IO layer; also determines
#' the size of chunks when use_threads is `TRUE`. NB: if `FALSE`, JSON input
#' must end with an empty line.
#'
#' `CsvReadOptions$create()` further accepts these additional arguments:
#'
#' - `skip_rows` Number of lines to skip before reading data (default 0)
#' - `column_names` Character vector to supply column names. If length-0
#' (the default), the first non-skipped row will be parsed to generate column
#' names, unless `autogenerate_column_names` is `TRUE`.
#' - `autogenerate_column_names` Logical: generate column names instead of
#' using the first non-skipped row (the default)? If `TRUE`, column names will
#' be "f0", "f1", ..., "fN".
#'
#' `CsvParseOptions$create()` takes the following arguments:
#'
#' - `delimiter` Field delimiting character (default `","`)
#' - `quoting` Logical: are strings quoted? (default `TRUE`)
#' - `quote_char` Quoting character, if `quoting` is `TRUE`
#' - `double_quote` Logical: are quotes inside values double-quoted? (default `TRUE`)
#' - `escaping` Logical: whether escaping is used (default `FALSE`)
#' - `escape_char` Escaping character, if `escaping` is `TRUE`
#' - `newlines_in_values` Logical: are values allowed to contain CR (`0x0d`)
#'    and LF (`0x0a`) characters? (default `FALSE`)
#' - `ignore_empty_lines` Logical: should empty lines be ignored (default) or
#'    generate a row of missing values (if `FALSE`)?
#'
#' `JsonParseOptions$create()` accepts only the `newlines_in_values` argument.
#'
#' `CsvConvertOptions$create()` takes the following arguments:
#'
#' - `check_utf8` Logical: check UTF8 validity of string columns? (default `TRUE`)
#' - `null_values` character vector of recognized spellings for null values.
#'    Analogous to the `na.strings` argument to
#'    [`read.csv()`][utils::read.csv()] or `na` in `readr::read_csv()`.
#' - `strings_can_be_null` Logical: can string / binary columns have
#'    null values? Similar to the `quoted_na` argument to `readr::read_csv()`.
#'    (default `FALSE`)
#'
#' @section Methods:
#'
#' These classes have no implemented methods. They are containers for the
#' options.
#'
#' @export
CsvReadOptions <- R6Class("CsvReadOptions", inherit = ArrowObject)
# Factory: collect the read settings into a named list, pass that list to the
# initializer, and wrap the result as a CsvReadOptions.
CsvReadOptions$create <- function(use_threads = option_use_threads(),
                                  block_size = 1048576L,
                                  skip_rows = 0L,
                                  column_names = character(0),
                                  autogenerate_column_names = FALSE) {
  settings <- list(
    use_threads = use_threads,
    block_size = block_size,
    skip_rows = skip_rows,
    column_names = column_names,
    autogenerate_column_names = autogenerate_column_names
  )
  options_ptr <- csv___ReadOptions__initialize(settings)
  shared_ptr(CsvReadOptions, options_ptr)
}

# Translate readr-style `skip` / `col_names` into a CsvReadOptions.
#   col_names = FALSE -> ask Arrow to autogenerate the column names
#   col_names = TRUE  -> pass a 0-length character vector (the C++ default,
#                        meaning the names are parsed from the file)
#   anything else     -> passed through unchanged as `column_names`
readr_to_csv_read_options <- function(skip, col_names) {
  if (identical(col_names, FALSE)) {
    return(CsvReadOptions$create(
      skip_rows = skip,
      autogenerate_column_names = TRUE
    ))
  }

  header <- if (isTRUE(col_names)) character(0) else col_names
  CsvReadOptions$create(skip_rows = skip, column_names = header)
}

#' @rdname CsvReadOptions
#' @usage NULL
#' @format NULL
#' @docType class
#' @export
CsvParseOptions <- R6Class("CsvParseOptions", inherit = ArrowObject)
# Factory: collect the parse settings into a named list, pass that list to the
# initializer, and wrap the result as a CsvParseOptions.
CsvParseOptions$create <- function(delimiter = ",",
                                   quoting = TRUE,
                                   quote_char = '"',
                                   double_quote = TRUE,
                                   escaping = FALSE,
                                   escape_char = '\\',
                                   newlines_in_values = FALSE,
                                   ignore_empty_lines = TRUE) {
  settings <- list(
    delimiter = delimiter,
    quoting = quoting,
    quote_char = quote_char,
    double_quote = double_quote,
    escaping = escaping,
    escape_char = escape_char,
    newlines_in_values = newlines_in_values,
    ignore_empty_lines = ignore_empty_lines
  )
  options_ptr <- csv___ParseOptions__initialize(settings)
  shared_ptr(CsvParseOptions, options_ptr)
}

readr_to_csv_parse_options <- function(delim = ",",
                                       quote = '"',
                                       escape_double = TRUE,
                                       escape_backslash = FALSE,
                                       skip_empty_rows = TRUE) {
  # Maps the readr argument names onto the Arrow parse option names.
  # TODO: validate inputs

  # An empty `quote` string switches quoting off altogether
  uses_quotes <- nzchar(quote)
  # Backslash escaping also turns on `newlines_in_values`
  uses_backslash <- escape_backslash

  CsvParseOptions$create(
    delimiter = delim,
    quoting = uses_quotes,
    quote_char = quote,
    double_quote = escape_double,
    escaping = uses_backslash,
    escape_char = '\\',
    newlines_in_values = uses_backslash,
    ignore_empty_lines = skip_empty_rows
  )
}

#' @rdname CsvReadOptions
#' @usage NULL
#' @format NULL
#' @docType class
#' @export
CsvConvertOptions <- R6Class("CsvConvertOptions", inherit = ArrowObject)
# Factory: collect the conversion settings into a named list, pass that list
# to the initializer, and wrap the result as a CsvConvertOptions.
CsvConvertOptions$create <- function(check_utf8 = TRUE,
                                     null_values = c("", "NA"),
                                     strings_can_be_null = FALSE) {
  # TODO: there are more conversion options available that are not wired up:
  #   * Optional per-column types (disabling type inference on those columns)
  #       std::unordered_map<std::string, std::shared_ptr<DataType>> column_types;
  #   * Recognized spellings for boolean values
  #       std::vector<std::string> true_values;
  #       std::vector<std::string> false_values;
  settings <- list(
    check_utf8 = check_utf8,
    null_values = null_values,
    strings_can_be_null = strings_can_be_null
  )
  options_ptr <- csv___ConvertOptions__initialize(settings)
  shared_ptr(CsvConvertOptions, options_ptr)
}

# Translate readr-style arguments into a CsvConvertOptions:
# `na` becomes `null_values` and `quoted_na` becomes `strings_can_be_null`.
readr_to_csv_convert_options <- function(na, quoted_na) {
  CsvConvertOptions$create(
    null_values = na,
    strings_can_be_null = quoted_na
  )
}
